#lang racket/base
(provide
 (struct-out group)
 (struct-out literal)
 (struct-out sep)
 (struct-out id)
 (struct-out stx)

 lexer? port->lexer generator->lexer sequence->lexer
 lex lex-list

 lexer-literal-parsers BAD-PARSE)

(require
 threading
 racket/match
 racket/format
 racket/string
 (only-in racket/generator generator yield)
 syntax/parse/define
 (for-syntax racket/base))

;; ==========================
;; LEXER AST

;; Lexer "tokens" (still a tree tho)
(struct group   [style tokens] #:transparent)
(struct literal [value] #:transparent)
(struct sep     [str] #:transparent)
(struct id      [symbol] #:transparent)

;; Wrap additional metadata around token. This struct should be extended to attach new types of info
;; (e.g.: syntax properties, scope sets).
(struct stx [token srcpos])

;; ==========================
;; LEXER CONFIG

(define BAD-PARSE
  (let () (struct bad-parse []) (bad-parse)))

;; string -> (or exact-integer BAD-PARSE)
(define (parse-integer s)
  (match (string->number s)
    [(? exact-integer? n) n]
    [_ BAD-PARSE]))

;; [listof [string -> (or value BAD-PARSE)]]
(define lexer-literal-parsers
  (make-parameter
   (list parse-integer)))

;; ==========================
;; LEXING

(struct lx:atom [token] #:transparent)
(struct lx:open [char] #:transparent)
(struct lx:close [char] #:transparent)
(struct lx:eof [] #:transparent)

;; (reader port
;;         #:whitespace regexp-string
;;         [regexp-string capture-id re-match-body ...]
;;         ...
;;         [id ident-match-body ...]
;;         [#:eof eof-body ...])
;;
;; Returns a zero-arg generator function that reads from 'port'.
;; The generator returns 're-match-body' if one of the regexp matches the next sequence in the port
;; (with capture-id bound to the string captured by the regexp).
;; The generator returns 'ident-match-body' if no regexp matches, where 'id' is bound to the string
;; up until that point.
;; The generator returns 'eof-body' when the end of file is encountered.
(define-simple-macro (build-reader port-expr
                                   #:whitespace rx-ws-str-expr:expr
                                   [rx-str-expr:expr rx-cap:id rx-rhs ...+]
                                   ...
                                   [identifier:id ident-rhs ...+]
                                   [#:eof eof-rhs ...+])
  #:with [cap-range ...] (generate-temporaries #'[rx-cap ...])
  (let ([port port-expr]
        [rx (~> (list (string-append "(" rx-str-expr ")") ...) ;; Make each regexp a capture group
                (string-join _ "|")                            ;; Combine with |
                (format "(~a)|~a|$" rx-ws-str-expr _)          ;; Match whitespace and EOF
                pregexp)])
    (generator ()
      (let loop ()
        (match-define
          ;; Bind "capture ranges" from running the regexp on the port
          (list (cons start end)
                ws-cap-range
                cap-range
                ...)
          (regexp-match-peek-positions rx port))

        (let* ([before (bytes->string/utf-8 (read-bytes start port))]
               [matched (bytes->string/utf-8 (read-bytes (- end start) port))])

          ;; If before is non-empty, then treat that as an identifier
          (unless (zero? start)
            (yield (let ([identifier before]) ident-rhs ...)))

          (cond [ws-cap-range
                 ;; Ignore whitespace
                 (loop)]
                [cap-range
                 ;; Defer to appropriate rx-rhs if the corresponding cap-range is not #f.
                 (yield (let ([rx-cap matched]) rx-rhs ...))
                 (loop)]
                ...
                [else ;; No match so must be EOF
                 eof-rhs ...]))))))

;; (lexer [-> lx:*])
(struct lexer [reader])

;; Create a new lexer that parses from given port.
;; -> lexer
(define (port->lexer port)
  (define lit-parsers (lexer-literal-parsers))
  (lexer
   (build-reader port
                 #:whitespace "\\s+"
                 ["[([{]" s (lx:open (string-ref s 0))]
                 ["[)\\]}]" s (lx:close (string-ref s 0))]
                 ["[,:;=|.]" s (lx:atom (sep s))]

                 [ident
                  (or (for*/first ([parse (in-list lit-parsers)]
                                   [v (in-value (parse ident))]
                                   #:when (not (eq? v BAD-PARSE)))
                        (lx:atom (literal v)))
                      (lx:atom (id ident)))]

                 [#:eof
                  (lx:eof)])))


;; Create a new lexer that uses the given generator.
;; [-> lx:*] -> lexer
(define (generator->lexer gen)
  (lexer gen))

;; Create a new lexer that uses the given sequence.
;; [sequenceof lx*:] -> lexer
(define (sequence->lexer seq)
  (lexer (generator ()
           (for ([x seq]) (yield x))
           (let loop ()
             (yield (lx:eof))
             (loop)))))

;; lexer char -> char
(define (lexer-close-char _lx open-chr)
  (case open-chr
    [(#\() #\)]
    [(#\{) #\}]
    [(#\[) #\]]))

;; lexer -> (or lx:atom lx:open lx:close lx:eof)
(define (lex lx)
  ((lexer-reader lx)))

;; lexer (or char 'eof) -> [listof token]
(define (lex-list lx closing)
  (match (lex lx)
    [(lx:atom tok)
     (cons tok (lex-list lx closing))]

    [(lx:open chr)
     (define close-chr
       (lexer-close-char lx chr))
     (define tok
       (group chr (lex-list lx close-chr)))
     (cons tok (lex-list lx closing))]

    [(lx:close chr)
     (when (eq? closing 'eof)
       (error (~a "unexpected closing paren: '" chr "'")))
     (unless (eqv? chr closing)
       (error (~a "unexpected closing paren '" chr "', expected '" closing "'")))
     '()]

    [(lx:eof)
     (unless (eq? closing 'eof)
       (error (~a "did not find expected closing paren '" closing "'")))
     '()]))


(module+ test
  (require
   rackunit)

  (let ([lx (port->lexer
             (open-input-string "1 hello: (a b) {a, [b=c]}"))])
    (check-equal?
     (lex-list lx 'eof)
     (list (literal 1)
           (id "hello")
           (sep ":")
           (group #\( (list (id "a") (id "b")))
           (group #\{ (list (id "a")
                            (sep ",")
                            (group #\[ (list (id "b")
                                             (sep "=")
                                             (id "c"))))))))

  (let ([lx (sequence->lexer
             ;; "1 (a)"
             (list (lx:atom (literal 1))
                   (lx:open #\()
                   (lx:atom (id "a"))
                   (lx:close #\))))])
    (check-equal?
     (lex-list lx 'eof)
     (list (literal 1)
           (group #\( (list (id "a")))))))
